# Load all necessary packages
library(data.table) # For faster data manipulation
library(tidyverse) # For data manipulation and visualization
library(sf) # For spatial data handling
library(leaflet) # For interactive maps
library(leaflet.extras) # For additional leaflet features
library(mapview) # For easier map visualization
library(tmap) # For thematic maps
library(tigris) # For US road networks
library(future) # For parallel processing
library(future.apply) # For parallel processing with apply functions
library(fixest) # For efficient fixed effects estimation
library(modelsummary) # For regression tables
library(patchwork) # For combining plots
library(MatchIt) # For matching
library(lubridate) # For date handling
# Create directories if they don't exist
if (!dir.exists("./data/tigris")) {
dir.create("./data/tigris", recursive = TRUE)
}
if (!dir.exists("./data/processed")) {
dir.create("./data/processed", recursive = TRUE)
}
if (!dir.exists("./imgs")) {
dir.create("./imgs", recursive = TRUE)
}
# Set custom cache directory
options(tigris_cache_dir = "./data/tigris")
# Configure tigris to use caching
options(tigris_use_cache = TRUE)
# Function to create buffers in batches with proper projection
create_buffers_in_batches <- function(sf_object, buffer_dist, batch_size = 500) {
message("Reprojecting data to a meter-based CRS...")
sf_object_projected <- st_transform(sf_object, 3310) # California Albers
n_features <- nrow(sf_object_projected)
n_batches <- ceiling(n_features / batch_size)
# Create empty list to store results
buffer_list <- vector("list", n_batches)
# Process in batches
for (i in 1:n_batches) {
start_idx <- (i-1) * batch_size + 1
end_idx <- min(i * batch_size, n_features)
cat(sprintf("Processing batch %d of %d (features %d to %d)\n",
i, n_batches, start_idx, end_idx))
# Extract batch
batch <- sf_object_projected[start_idx:end_idx, ]
# Create buffer
buffer_list[[i]] <- st_buffer(batch, dist = buffer_dist)
}
# Combine results
result <- do.call(rbind, buffer_list)
# Reproject back to original CRS
message("Reprojecting results back to original CRS...")
st_transform(result, st_crs(sf_object))
}
# Unified function to create both point and line buffers
create_all_buffers <- function(point_sf, line_sf, buffer_dist = 300) {
# Process point buffers
point_buffers <- point_sf %>%
mutate(construction_id = if("ID" %in% names(.)) ID else row_number(),
construction_type = "point") %>%
select(construction_id, construction_type, Start_Time, End_Time, geometry) %>%
rename(start_time = Start_Time, end_time = End_Time) %>%
create_buffers_in_batches(buffer_dist = buffer_dist, batch_size = 500)
# Process line buffers if provided
if(!missing(line_sf) && nrow(line_sf) > 0) {
# Calculate segment length once
line_sf_with_length <- line_sf %>%
st_transform(3310) %>%
mutate(segment_length_m = as.numeric(st_length(geometry))) %>%
st_transform(4326)
line_buffers <- line_sf_with_length %>%
mutate(construction_id = if("ID" %in% names(.)) ID else row_number() + nrow(point_sf),
construction_type = "line") %>%
select(construction_id, construction_type, Start_Time, End_Time,
segment_length_m, geometry) %>%
rename(start_time = Start_Time, end_time = End_Time) %>%
create_buffers_in_batches(buffer_dist = buffer_dist, batch_size = 250)
# Combine buffers
return(list(
combined_buffers = bind_rows(
select(point_buffers, -construction_type),
select(line_buffers, -construction_type, -segment_length_m)
),
point_buffers = point_buffers,
line_buffers = line_buffers,
line_data = line_sf_with_length
))
}
# Return just point buffers if no lines
return(list(
combined_buffers = point_buffers,
point_buffers = point_buffers
))
}
# Helper function to find accidents near constructions
find_accidents_near_construction <- function(accident_sf, buffer_results) {
# For point buffers
point_spatial_matches <- accident_sf %>%
mutate(accident_id = if("ID" %in% names(.)) ID else row_number(),
accident_severity = if("Severity" %in% names(.)) Severity else NA_integer_,
accident_time = Start_Time) %>%
select(accident_id, accident_severity, accident_time, geometry) %>%
st_join(buffer_results$point_buffers, join = st_within) %>%
filter(!is.na(construction_id) &
accident_time >= start_time &
accident_time <= end_time) %>%
select(accident_id, accident_severity, accident_time, construction_id, construction_type)
# For line buffers if they exist
if ("line_buffers" %in% names(buffer_results)) {
line_spatial_matches <- accident_sf %>%
mutate(accident_id = if("ID" %in% names(.)) ID else row_number(),
accident_severity = if("Severity" %in% names(.)) Severity else NA_integer_,
accident_time = Start_Time) %>%
select(accident_id, accident_severity, accident_time, geometry) %>%
st_join(buffer_results$line_buffers, join = st_within) %>%
filter(!is.na(construction_id) &
accident_time >= start_time &
accident_time <= end_time) %>%
select(accident_id, accident_severity, accident_time, construction_id,
construction_type, segment_length_m)
# Combine both types of matches
return(list(
point_matches = point_spatial_matches,
line_matches = line_spatial_matches,
all_matches = bind_rows(
point_spatial_matches,
select(line_spatial_matches, -segment_length_m)
)
))
}
# Return just point matches if no line buffers
return(list(
point_matches = point_spatial_matches,
all_matches = point_spatial_matches
))
}
# Helper function to analyze accidents near construction
analyze_accidents_near_construction <- function(spatial_matches, accident_sf, line_data) {
# Count construction sites per accident
construction_counts <- spatial_matches$all_matches %>%
st_set_geometry(NULL) %>%
group_by(accident_id) %>%
summarize(nearby_construction_count = n())
# Randomly select one construction site per accident
accident_construction_analysis <- spatial_matches$all_matches %>%
st_set_geometry(NULL) %>%
left_join(construction_counts, by = "accident_id") %>%
mutate(
temporal_overlap = 1,
is_near_active_construction = 1
) %>%
group_by(accident_id) %>%
slice_sample(n = 1) %>%
ungroup()
# Calculate accident rates for line segments
if ("line_matches" %in% names(spatial_matches) && !is.null(line_data)) {
construction_segment_accident_rates <- line_data %>%
mutate(construction_id = if("ID" %in% names(.)) ID else row_number()) %>%
left_join(
spatial_matches$line_matches %>%
st_set_geometry(NULL) %>%
group_by(construction_id) %>%
summarize(accident_count = n()),
by = "construction_id"
) %>%
mutate(
accident_count = replace_na(accident_count, 0),
accidents_per_km = accident_count / (segment_length_m / 1000)
)
} else {
construction_segment_accident_rates <- NULL
}
# Add construction info to all accidents
accidents_with_construction <- accident_sf %>%
mutate(accident_id = if("ID" %in% names(.)) ID else row_number()) %>%
left_join(
accident_construction_analysis %>%
select(accident_id, construction_id, nearby_construction_count, construction_type),
by = "accident_id"
) %>%
mutate(near_construction = ifelse(is.na(construction_id), 0, 1))
# Return results
return(list(
accident_construction_analysis = accident_construction_analysis,
construction_segment_accident_rates = construction_segment_accident_rates,
accidents_with_construction = accidents_with_construction,
total_accidents = nrow(accident_sf),
accidents_near_construction = nrow(accident_construction_analysis),
pct_near_construction = round(nrow(accident_construction_analysis)/nrow(accident_sf)*100, 2)
))
}
# NEW FUNCTION: Create analysis panel with improved spatial matching
create_analysis_panel <- function(df.const.sf, df.acc.sf, ca_roads, buffer_dist = 300) {
# Step 1: Create a more appropriate spatial unit - larger road chunks
# Dissolve/merge road segments by road name and type to create longer segments
cat("Simplifying road network...\n")
simplified_roads <- ca_roads %>%
group_by(FULLNAME, RTTYP) %>%
summarize(geometry = st_union(geometry), .groups = "drop") %>%
mutate(road_id = row_number(),
road_length_km = as.numeric(st_length(st_transform(geometry, 3310)))/1000)
# Step 2: Create construction buffers and find which roads they intersect
cat("Creating construction buffers and finding affected roads...\n")
const_buffers <- st_buffer(st_transform(df.const.sf, 3310), buffer_dist) %>%
st_transform(st_crs(ca_roads))
road_construction <- simplified_roads %>%
st_join(const_buffers %>%
select(ID, Start_Time, End_Time), join = st_intersects) %>%
filter(!is.na(ID)) %>%
mutate(
const_start = floor_date(Start_Time, "month"),
const_end = floor_date(End_Time, "month")
) %>%
st_set_geometry(NULL) %>%
select(road_id, FULLNAME, RTTYP, const_start, const_end, ID)
# Step 3: Create a panel with monthly observations for all roads
cat("Creating panel dataset...\n")
months <- seq(as.Date("2021-01-01"), as.Date("2021-12-01"), by = "month")
# Create base panel with all road-month combinations
panel_base <- expand.grid(
road_id = simplified_roads$road_id,
month = months
) %>% as_tibble()
# Add construction activity flags
cat("Adding construction treatment indicators...\n")
panel_with_construction <- panel_base %>%
left_join(
road_construction %>%
# Expand each construction to all months it spans
group_by(road_id, ID) %>%
do({
months_seq <- seq(from = .$const_start[1], to = .$const_end[1], by = "month")
data.frame(
road_id = .$road_id[1],
month = months_seq,
has_construction = 1,
const_id = .$ID[1]
)
}) %>%
ungroup() %>%
# Consolidate multiple constructions on same road-month
group_by(road_id, month) %>%
summarize(
has_construction = 1,
construction_count = n(),
.groups = "drop"
),
by = c("road_id", "month")
) %>%
mutate(has_construction = ifelse(is.na(has_construction), 0, has_construction))
# Step 4: Count accidents for each road segment by month
cat("Matching accidents to roads...\n")
# Buffer roads slightly to catch nearby accidents
road_buffers <- st_buffer(st_transform(simplified_roads, 3310), 50) %>%
st_transform(st_crs(df.acc.sf))
accident_counts <- df.acc.sf %>%
mutate(month = floor_date(Start_Time, "month")) %>%
st_join(road_buffers %>% select(road_id), join = st_intersects) %>%
filter(!is.na(road_id)) %>%
st_set_geometry(NULL) %>%
group_by(road_id, month) %>%
summarize(
accident_count = n(),
severe_accident_count = sum(Severity >= 3, na.rm = TRUE),
avg_severity = mean(Severity, na.rm = TRUE),
.groups = "drop"
)
# Step 5: Create final panel with all variables
cat("Finalizing panel dataset...\n")
panel_final <- panel_with_construction %>%
left_join(accident_counts, by = c("road_id", "month")) %>%
left_join(
simplified_roads %>%
st_set_geometry(NULL) %>%
select(road_id, road_length_km, FULLNAME, RTTYP),
by = "road_id"
) %>%
# Add road type classification
mutate(
road_type = case_when(
grepl("I", RTTYP) ~ "Interstate",
grepl("U", RTTYP) ~ "US Highway",
grepl("S", RTTYP) ~ "State Highway",
TRUE ~ "Other"
),
# Fill in missing accident counts with 0
accident_count = replace_na(accident_count, 0),
severe_accident_count = replace_na(severe_accident_count, 0),
avg_severity = replace_na(avg_severity, 0),
# Calculate per-km rates
accidents_per_km = accident_count / road_length_km,
severe_accidents_per_km = severe_accident_count / road_length_km,
# Extract time variables
year = year(month),
month_num = month(month),
is_winter = month_num %in% c(12, 1, 2),
is_summer = month_num %in% c(6, 7, 8)
)
# Add urban classification
cat("Adding urban/rural classification...\n")
urban_areas <- urban_areas(year = 2021)
# Check which roads intersect urban areas
urban_roads <- simplified_roads %>%
st_transform(st_crs(urban_areas)) %>%
st_join(urban_areas %>% select(UACE10, NAME10, UATYP10), join = st_intersects) %>%
mutate(
is_urban = !is.na(UACE10),
urban_type = ifelse(is.na(UATYP10), "Rural", UATYP10)
) %>%
st_set_geometry(NULL) %>%
select(road_id, is_urban, urban_type)
# Join urban classification
panel_final <- panel_final %>%
left_join(urban_roads, by = "road_id")
# Add weather data if available (average by month for simplicity)
if ("Temperature(F)" %in% names(df.acc.sf) && "Precipitation(in)" %in% names(df.acc.sf)) {
cat("Adding weather data...\n")
weather_data <- df.acc.sf %>%
mutate(month = floor_date(Start_Time, "month")) %>%
group_by(month) %>%
summarize(
avg_temperature = mean(`Temperature(F)`, na.rm = TRUE),
avg_precipitation = mean(`Precipitation(in)`, na.rm = TRUE),
pct_adverse_weather = mean(Weather_Condition %in%
c("Rain", "Snow", "Fog", "Thunderstorm"), na.rm = TRUE),
.groups = "drop"
) %>%
st_set_geometry(NULL)
panel_final <- panel_final %>%
left_join(weather_data, by = "month")
}
# Print diagnostic information
cat("\nSummary Statistics:\n")
cat("Total unique roads:", nrow(simplified_roads), "\n")
cat("Roads with construction:", length(unique(road_construction$road_id)), "\n")
cat("Road-months with construction:", sum(panel_final$has_construction), "\n")
cat("Roads with accidents:", length(unique(accident_counts$road_id)), "\n")
cat("Total accidents matched to roads:", sum(accident_counts$accident_count), "\n")
cat("Final panel dimensions:", nrow(panel_final), "x", ncol(panel_final), "\n")
return(list(
panel_data = panel_final,
simplified_roads = simplified_roads,
road_construction = road_construction,
accident_counts = accident_counts
))
}
# Run all main regression models
run_regression_models <- function(panel_data) {
# 1. Base DiD Model
model1 <- feols(
accident_count ~ has_construction | road_id + month,
data = panel_data
)
# 2. Extended Model with Urban/Rural Heterogeneity
model2 <- feols(
accident_count ~ has_construction + has_construction:is_urban | road_id + month,
data = panel_data
)
# 3. Full Model with additional controls
# Check if weather data is available
if (all(c("avg_temperature", "avg_precipitation", "pct_adverse_weather") %in% names(panel_data))) {
model3 <- feols(
accident_count ~ has_construction + has_construction:is_urban +
has_construction:is_winter + has_construction:is_summer +
avg_temperature + avg_precipitation + pct_adverse_weather |
road_id + month,
data = panel_data
)
} else {
model3 <- feols(
accident_count ~ has_construction + has_construction:is_urban +
has_construction:is_winter + has_construction:is_summer |
road_id + month,
data = panel_data
)
}
# 4. Accident rate model (normalized by segment length)
model4 <- feols(
accidents_per_km ~ has_construction + has_construction:is_urban | road_id + month,
data = panel_data
)
# 5. Severity model
model5 <- feols(
avg_severity ~ has_construction + has_construction:is_urban | road_id + month,
data = panel_data
)
return(list(
base_model = model1,
urban_het_model = model2,
full_model = model3,
rate_model = model4,
severity_model = model5,
all_models = list(model1, model2, model3, model4, model5)
))
}
# Helper functions for robustness checks
create_event_study_data <- function(panel_data) {
panel_data %>%
group_by(road_id) %>%
mutate(
first_construction = if(any(has_construction == 1)) min(month[has_construction == 1]) else NA,
rel_month = as.numeric(difftime(month, first_construction, units = "days")) / 30,
rel_period = case_when(
is.na(rel_month) ~ NA_character_,
rel_month <= -3 ~ "t-3+",
rel_month <= -2 ~ "t-2",
rel_month <= -1 ~ "t-1",
rel_month < 0 ~ "t-0",
rel_month < 1 ~ "t+0",
rel_month < 2 ~ "t+1",
rel_month < 3 ~ "t+2",
TRUE ~ "t+3+"
)
) %>%
ungroup() %>%
mutate(
rel_period = factor(rel_period,
levels = c("t-3+", "t-2", "t-1", "t-0", "t+0", "t+1", "t+2", "t+3+"))
)
}
run_placebo_tests <- function(panel_data, iterations = 100) {
placebo_results <- data.frame(iteration = 1:iterations, estimate = NA, std.error = NA)
for(i in 1:iterations) {
set.seed(i)
temp_data <- panel_data %>%
mutate(placebo_construction = sample(has_construction))
temp_model <- feols(
accident_count ~ placebo_construction | road_id + month,
data = temp_data
)
placebo_results$estimate[i] <- coef(temp_model)[1]
placebo_results$std.error[i] <- se(temp_model)[1]
}
return(placebo_results)
}
run_fe_variations <- function(panel_data) {
list(
"No FE" = feols(accident_count ~ has_construction, data = panel_data),
"Road FE" = feols(accident_count ~ has_construction | road_id, data = panel_data),
"Time FE" = feols(accident_count ~ has_construction | month, data = panel_data),
"Both FE" = feols(accident_count ~ has_construction | road_id + month, data = panel_data)
)
}
run_transform_variations <- function(panel_data) {
list(
"Linear" = feols(accident_count ~ has_construction | road_id + month, data = panel_data),
"Log" = feols(log(accident_count + 1) ~ has_construction | road_id + month, data = panel_data),
"Poisson" = fepois(accident_count ~ has_construction | road_id + month, data = panel_data)
)
}
# Run all robustness checks
run_robustness_checks <- function(panel_data, base_model) {
# Results list
robustness_results <- list()
# 1. Event study approach
panel_data_event <- create_event_study_data(panel_data)
event_model <- feols(
accident_count ~ i(rel_period, ref = "t-1") | road_id + month,
data = panel_data_event %>% filter(!is.na(rel_period))
)
robustness_results$event_model <- event_model
robustness_results$panel_data_event <- panel_data_event
# 2. Placebo tests
placebo_results <- run_placebo_tests(panel_data, 100)
robustness_results$placebo_results <- placebo_results
# 3. Fixed effects variations
fe_models <- run_fe_variations(panel_data)
robustness_results$fe_models <- fe_models
# 4. Transform outcome variables
transform_models <- run_transform_variations(panel_data)
robustness_results$transform_models <- transform_models
# 5. Add matching (simplified, assumes is_urban and road_type are factors)
matched_data <- matchit(
has_construction ~ road_type + road_length_km + is_urban,
data = panel_data %>%
filter(month == min(month)) %>%
mutate(
road_type = factor(road_type),
is_urban = factor(is_urban)
),
method = "nearest"
)
robustness_results$matched_data <- matched_data
# Return all results
return(robustness_results)
}
# Create static maps
create_static_maps <- function(df.acc.sf, df.const.sf, df.const.lines, ca_roads) {
# Accident map
ca_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = df.acc.sf,
aes(color = factor(year)), alpha = 0.7, size = 0.5) +
scale_color_viridis_d(name = "Year") +
theme_minimal() +
labs(title = "California Road Accidents (2021)") +
theme(legend.position = "bottom")
# Construction map
ca_const_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = df.const.sf,
aes(color = factor(year)), alpha = 0.7, size = 0.5) +
scale_color_viridis_d(name = "Year") +
theme_minimal() +
labs(title = "California Construction Sites (2021)") +
theme(legend.position = "bottom")
# Construction line segments map
lines_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = df.const.lines,
color = "orange", size = 1) +
theme_minimal() +
labs(title = "California Construction Segments (2021)")
return(list(
accidents_map = ca_plot,
construction_map = ca_const_plot,
construction_lines_map = lines_plot
))
}
# Create heatmaps
create_heatmaps <- function(df.acc, df.const, ca_roads) {
# Accident heatmap
accident_heatmap <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
stat_density_2d(
data = df.acc %>% filter(!is.na(Start_Lat) & !is.na(Start_Lng)),
aes(x = Start_Lng, y = Start_Lat, fill = after_stat(density), alpha = after_stat(density)),
geom = "tile",
contour = FALSE,
h = 0.1,
n = 200
) +
scale_alpha_continuous(range = c(0, 0.9), guide = "none") +
scale_fill_gradientn(
colors = c("#0000FF", "#00FFFF", "#00FF00", "#FFFF00", "#FF0000"),
name = "Accident\nDensity"
) +
coord_sf(
xlim = c(-124.5, -114.5),
ylim = c(32.5, 42.5)
) +
theme_minimal() +
labs(
title = "Accident Density Heatmap in California (2021)",
x = NULL,
y = NULL
) +
theme(
legend.position = "right",
plot.title = element_text(face = "bold")
)
# Construction heatmap
construction_heatmap <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
stat_density_2d(
data = df.const %>% filter(!is.na(Start_Lat) & !is.na(Start_Lng)),
aes(x = Start_Lng, y = Start_Lat, fill = after_stat(density)),
geom = "tile",
contour = FALSE,
alpha = 0.7,
h = 0.1,
n = 200
) +
scale_alpha_continuous(range = c(0, 0.9), guide = "none") +
scale_fill_gradientn(
colors = c("yellow", "orange", "red"),
name = "Construction\nDensity"
) +
coord_sf(
xlim = c(-124.5, -114.5),
ylim = c(32.5, 42.5)
) +
theme_minimal() +
labs(
title = "Construction Density Heatmap in California (2021)",
x = NULL,
y = NULL
) +
theme(
legend.position = "right",
plot.title = element_text(face = "bold")
)
return(list(
accident_heatmap = accident_heatmap,
construction_heatmap = construction_heatmap
))
}
# Create interactive maps
create_interactive_maps <- function(df.acc.sf, df.const.sf, df.const.lines) {
# Accidents map
accident_map <- leaflet() %>%
addTiles() %>%
addHeatmap(data = df.acc.sf,
intensity = ~1,
radius = 8,
blur = 10) %>%
setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%
setMaxBounds(lng1 = -124.6, lat1 = 42.0,
lng2 = -114.1, lat2 = 32.5)
# Construction sites map
construction_map <- leaflet() %>%
addTiles() %>%
addHeatmap(data = df.const.sf,
intensity = ~1,
radius = 8,
blur = 10,
gradient = c("yellow", "orange", "red")) %>%
setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%
setMaxBounds(lng1 = -124.6, lat1 = 42.0,
lng2 = -114.1, lat2 = 32.5)
# Construction lines map
construction_lines_map <- leaflet() %>%
addTiles() %>%
addPolylines(data = df.const.lines,
color = "orange",
weight = 3,
opacity = 0.7,
popup = ~paste("ID:", ID, "<br>Duration:", duration)) %>%
setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%
setMaxBounds(lng1 = -124.6, lat1 = 42.0,
lng2 = -114.1, lat2 = 32.5)
return(list(
accident_map = accident_map,
construction_map = construction_map,
construction_lines_map = construction_lines_map
))
}
# Create buffer analysis visualizations
create_buffer_analysis_plots <- function(accidents_per_zone, ca_roads, analysis_results, severity_comparison, temporal_analysis) {
# Accident counts in construction zones
accident_zones_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = accidents_per_zone %>% arrange(accident_count),
aes(color = accident_count), size = 1.5) +
scale_color_viridis_c() +
theme_minimal() +
labs(title = "Accident Counts within Construction Zones (2021)")
# Accident rate per construction segment
if (!is.null(analysis_results$construction_segment_accident_rates)) {
segment_rate_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = analysis_results$construction_segment_accident_rates,
aes(color = accidents_per_km), size = 1.5) +
scale_color_viridis_c() +
theme_minimal() +
labs(title = "Accident Rate per Construction Segment (2021)",
color = "Accidents/km")
} else {
segment_rate_plot <- NULL
}
# Temporal analysis
temporal_plot <- ggplot(temporal_analysis, aes(x = month, y = accident_count)) +
geom_line() +
geom_point() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Accidents Near Construction Over Time",
x = "Month",
y = "Accident Count")
# Severity comparison
severity_plot <- ggplot(severity_comparison, aes(x = Severity, y = count, fill = location)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
labs(title = "Accident Severity: Near vs. Away from Construction",
x = "Severity Level",
y = "Count",
fill = "Location")
return(list(
accident_zones_plot = accident_zones_plot,
segment_rate_plot = segment_rate_plot,
temporal_plot = temporal_plot,
severity_plot = severity_plot
))
}
# Create visualizations for new panel dataset
visualize_panel_data <- function(panel_results) {
# 1. Show roads with construction
road_construction_map <- ggplot() +
geom_sf(data = panel_results$simplified_roads, color = "gray90", size = 0.3) +
geom_sf(data = panel_results$simplified_roads %>%
filter(road_id %in% panel_results$road_construction$road_id),
color = "red", size = 1) +
theme_minimal() +
labs(title = "Roads with Construction Activities (2021)")
# 2. Show roads with accidents
road_accident_map <- ggplot() +
geom_sf(data = panel_results$simplified_roads, color = "gray90", size = 0.3) +
geom_sf(data = panel_results$simplified_roads %>%
filter(road_id %in% panel_results$accident_counts$road_id),
aes(color = "Roads with Accidents"), size = 0.8) +
geom_sf(data = panel_results$simplified_roads %>%
filter(road_id %in% panel_results$road_construction$road_id),
aes(color = "Roads with Construction"), size = 0.8) +
scale_color_manual(values = c("Roads with Accidents" = "blue",
"Roads with Construction" = "red")) +
theme_minimal() +
labs(title = "Roads with Accidents vs Construction (2021)",
color = "Road Type")
# 3. Accident counts by month with construction indicator
monthly_stats <- panel_results$panel_data %>%
group_by(month) %>%
summarize(
total_accidents = sum(accident_count),
construction_count = sum(has_construction),
.groups = "drop"
)
temporal_plot <- ggplot(monthly_stats) +
geom_col(aes(x = month, y = total_accidents), fill = "steelblue") +
geom_line(aes(x = month, y = construction_count * 10),
color = "red", size = 1.2) + # scaled for visualization
scale_y_continuous(
name = "Accident Count",
sec.axis = sec_axis(~./10, name = "Construction Count")
) +
theme_minimal() +
labs(title = "Monthly Accidents vs Construction Activities (2021)",
x = "Month")
# 4. Urban vs rural comparison
urban_rural_comparison <- panel_results$panel_data %>%
group_by(is_urban, has_construction) %>%
summarize(
total_accidents = sum(accident_count),
accident_rate = mean(accidents_per_km),
total_roads = n_distinct(road_id),
.groups = "drop"
) %>%
mutate(
road_category = case_when(
is_urban == TRUE & has_construction == 1 ~ "Urban with Construction",
is_urban == TRUE & has_construction == 0 ~ "Urban without Construction",
is_urban == FALSE & has_construction == 1 ~ "Rural with Construction",
is_urban == FALSE & has_construction == 0 ~ "Rural without Construction"
)
)
urban_rural_plot <- ggplot(urban_rural_comparison,
aes(x = road_category, y = accident_rate, fill = road_category)) +
geom_col() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Accident Rates by Urban/Rural and Construction Status",
x = NULL,
y = "Accidents per km",
fill = "Road Category")
return(list(
road_construction_map = road_construction_map,
road_accident_map = road_accident_map,
temporal_plot = temporal_plot,
urban_rural_plot = urban_rural_plot
))
}
# Create robustness check visualizations
# Create robustness check visualizations
visualize_robustness_checks <- function(robustness_results, model1) {
# 1. Event study plot - keep as separate object, don't include in patchwork
coef_plot <- iplot(robustness_results$event_model,
main = "Event Study: Effect of Construction on Accidents",
xlab = "Months Relative to Construction Start",
ylab = "Effect on Accident Count")
# 2. Placebo plot
placebo_plot <- ggplot(robustness_results$placebo_results, aes(x = estimate)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(xintercept = coef(model1)[1], color = "red", linewidth = 1.5) +
theme_minimal() +
labs(title = "Distribution of Placebo Effects vs. Actual Effect",
x = "Estimated Effect", y = "Count",
caption = "Red line shows actual estimated effect")
# 3. Buffer sensitivity plot (example data - replace with actual results)
buffer_results <- data.frame(
buffer_dist = c(200, 300, 400, 500),
estimate = c(0.12, 0.15, 0.11, 0.09),
lower_ci = c(0.05, 0.08, 0.04, 0.01),
upper_ci = c(0.19, 0.22, 0.18, 0.17)
)
buffer_plot <- ggplot(buffer_results, aes(x = factor(buffer_dist), y = estimate)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
theme_minimal() +
labs(title = "Sensitivity to Buffer Distance",
x = "Buffer Distance (meters)", y = "Estimated Effect on Accident Count")
# 4. Alternative FE specifications table
fe_comparison <- modelsummary(robustness_results$fe_models, stars = TRUE)
# 5. Different outcome transformations
transform_comparison <- modelsummary(robustness_results$transform_models, stars = TRUE)
# 6. Create a simplified panel with just standard ggplot objects
# Avoid using the iplot result in patchwork
robustness_panel <- placebo_plot + buffer_plot + plot_layout(ncol = 2)
return(list(
coef_plot = coef_plot, # Keep separate
placebo_plot = placebo_plot,
buffer_plot = buffer_plot,
fe_comparison = fe_comparison,
transform_comparison = transform_comparison,
robustness_panel = robustness_panel
))
}
Let’s load and process the data:
# Check if processed data exists to avoid reprocessing
if (file.exists("data/processed/df_acc.rds") &&
file.exists("data/processed/df_const.rds") &&
file.exists("data/processed/ca_roads.rds")) {
df.acc <- readRDS("data/processed/df_acc.rds")
df.const <- readRDS("data/processed/df_const.rds")
ca_roads <- readRDS("data/processed/ca_roads.rds")
df.acc.sf <- readRDS("data/processed/df_acc_sf.rds")
df.const.sf <- readRDS("data/processed/df_const_sf.rds")
df.const.lines <- readRDS("data/processed/df_const_lines.rds")
cat("Loaded preprocessed data\n")
} else {
# Load accident data
cat("Loading accident data...\n")
df.acc <- fread("data/us_accidents/US_accidents_March23.csv")[
# Filter date range of 2021
lubridate::year(as.Date(Start_Time)) == 2021 &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns
year = data.table::year(Start_Time),
quarter = data.table::quarter(Start_Time),
month = data.table::month(Start_Time),
# Calculate duration
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble()
# Load construction data
cat("Loading construction data...\n")
df.const <- fread("data/us_constructions/US_constructions_Dec21.csv")[
# Filter date range of 2021
lubridate::year(as.Date(Start_Time)) == 2021 &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns
year = year(Start_Time),
quarter = quarter(Start_Time),
month = month(Start_Time),
# Calculate duration
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble()
# Convert to SF objects
cat("Converting to SF objects...\n")
df.acc.sf <- df.acc %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
df.const.sf <- df.const %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
# Create construction line objects
cat("Creating construction linestrings...\n")
df.const.lines <- df.const %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng) & !is.na(End_Lat) & !is.na(End_Lng)) %>%
mutate(geometry = pmap(list(Start_Lng, Start_Lat, End_Lng, End_Lat),
function(start_lng, start_lat, end_lng, end_lat) {
coords <- matrix(c(start_lng, start_lat,
end_lng, end_lat),
ncol = 2, byrow = TRUE)
st_linestring(coords)
})) %>%
st_sf(crs = 4326)
# Load California roads
cat("Loading California roads...\n")
ca_roads <- primary_secondary_roads("CA", year = 2021)
# Save processed data for future use
saveRDS(df.acc, "data/processed/df_acc.rds")
saveRDS(df.const, "data/processed/df_const.rds")
saveRDS(ca_roads, "data/processed/ca_roads.rds")
saveRDS(df.acc.sf, "data/processed/df_acc_sf.rds")
saveRDS(df.const.sf, "data/processed/df_const_sf.rds")
saveRDS(df.const.lines, "data/processed/df_const_lines.rds")
}
## Loaded preprocessed data
Now let’s perform the spatial analysis:
# Check if we've already done the spatial analysis
if (file.exists("data/processed/buffer_results.rds") &&
file.exists("data/processed/spatial_matches.rds") &&
file.exists("data/processed/analysis_results.rds")) {
buffer_results <- readRDS("data/processed/buffer_results.rds")
spatial_matches <- readRDS("data/processed/spatial_matches.rds")
analysis_results <- readRDS("data/processed/analysis_results.rds")
cat("Loaded preprocessed spatial analysis results\n")
} else {
# Set up parallel processing
future::plan(future::multisession, workers = parallel::detectCores() - 1)
# Step 1: Create construction buffers (unified approach)
cat("Creating construction buffers...\n")
buffer_results <- create_all_buffers(df.const.sf, df.const.lines, buffer_dist = 300)
# Step 2: Find accidents near construction sites
cat("Finding accidents near construction...\n")
spatial_matches <- find_accidents_near_construction(df.acc.sf, buffer_results)
# Step 3: Analyze accidents near construction
cat("Analyzing accidents near construction...\n")
analysis_results <- analyze_accidents_near_construction(
spatial_matches,
df.acc.sf,
buffer_results$line_data
)
# Return to sequential processing
future::plan(future::sequential)
# Save results for future use
saveRDS(buffer_results, "data/processed/buffer_results.rds")
saveRDS(spatial_matches, "data/processed/spatial_matches.rds")
saveRDS(analysis_results, "data/processed/analysis_results.rds")
}
## Loaded preprocessed spatial analysis results
# Extract key accident statistics
accidents_per_zone <- analysis_results$construction_segment_accident_rates %>%
filter(accident_count > 0) %>%
select(construction_id, accident_count, geometry)
cat("Total accidents in dataset:", analysis_results$total_accidents, "\n")
## Total accidents in dataset: 341876
cat("Accidents near active construction:", analysis_results$accidents_near_construction, "\n")
## Accidents near active construction: 101660
cat("Percentage of accidents near active construction:",
analysis_results$pct_near_construction, "%\n")
## Percentage of accidents near active construction: 29.74 %
# Prepare severity comparison data
accidents_near <- analysis_results$accidents_with_construction %>%
filter(near_construction == 1) %>%
st_set_geometry(NULL) %>%
mutate(location = "Near construction") %>%
select(Severity, location)
accidents_away <- analysis_results$accidents_with_construction %>%
filter(near_construction == 0) %>%
st_set_geometry(NULL) %>%
mutate(location = "Away from construction") %>%
select(Severity, location)
severity_comparison <- bind_rows(accidents_near, accidents_away) %>%
group_by(Severity, location) %>%
summarize(count = n(), .groups = "drop") %>%
arrange(Severity, location)
# Prepare temporal analysis data
temporal_analysis <- spatial_matches$all_matches %>%
st_set_geometry(NULL) %>%
mutate(month = floor_date(accident_time, "month")) %>%
group_by(month) %>%
summarize(accident_count = n()) %>%
arrange(month)
Let’s create the visualizations:
# Create all visualizations
static_maps <- create_static_maps(df.acc.sf, df.const.sf, df.const.lines, ca_roads)
heatmaps <- create_heatmaps(df.acc, df.const, ca_roads)
interactive_maps <- create_interactive_maps(df.acc.sf, df.const.sf, df.const.lines)
buffer_plots <- create_buffer_analysis_plots(
accidents_per_zone,
ca_roads,
analysis_results,
severity_comparison,
temporal_analysis
)
# Display accident map
static_maps$accidents_map
ggsave("imgs/california_accidents.png", static_maps$accidents_map, width = 10, height = 8, dpi = 300)
# Display construction map
static_maps$construction_map
ggsave("imgs/california_construction.png", static_maps$construction_map, width = 10, height = 8, dpi = 300)
# Display accident heatmap
heatmaps$accident_heatmap
ggsave("imgs/accident_heatmap.png", heatmaps$accident_heatmap, width = 10, height = 8, dpi = 300)
# Display construction heatmap
heatmaps$construction_heatmap
ggsave("imgs/construction_heatmap.png", heatmaps$construction_heatmap, width = 10, height = 8, dpi = 300)
# Display interactive accident map
interactive_maps$accident_map
# Display interactive construction map
interactive_maps$construction_map